-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AMDGPU][CodeGen][True16] Track waitcnt as vgpr32 instead of vgpr16 for D16 Instructions in GFX11 #157795
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU][CodeGen][True16] Track waitcnt as vgpr32 instead of vgpr16 for D16 Instructions in GFX11 #157795
Conversation
f9550ce
to
39195c2
Compare
@llvm/pr-subscribers-backend-amdgpu Author: Brox Chen (broxigarchen) Changes: It seems that a VMEM access to the hi or lo half of a VGPR can interfere with the other half. Track waitcnt on the full 32-bit VGPR instead of the 16-bit half for 16-bit registers on GFX11. Patch is 90.59 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/157795.diff 15 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index a366db1c580ba..74fd7d543d42c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -586,6 +586,12 @@ def FeatureRealTrue16Insts : SubtargetFeature<"real-true16",
"Use true 16-bit registers"
>;
+def Feature16bitD16HWBug : SubtargetFeature<"d16-hw-bug",
+ "Enable16bitD16HWBug",
+ "true",
+ "D16 for 16 bit data type interfere the other half in true16 mode"
+>;
+
def FeatureBF16TransInsts : SubtargetFeature<"bf16-trans-insts",
"HasBF16TransInsts",
"true",
@@ -1934,7 +1940,9 @@ def FeatureISAVersion11_Common : FeatureSet<
FeaturePackedTID,
FeatureVcmpxPermlaneHazard,
FeatureMemoryAtomicFAddF32DenormalSupport,
- FeatureRealTrue16Insts]>;
+ FeatureRealTrue16Insts,
+ Feature16bitD16HWBug,
+]>;
// There are few workarounds that need to be
// added to all targets. This pessimizes codegen
@@ -2570,6 +2578,13 @@ def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() &&
// FIXME When we default to RealTrue16 instead of Fake, change the line as follows.
// AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>;
+// Do not use D16 inst for 16bit data type
+def Has16bitD16HWBug: Predicate<"Subtarget->has16bitD16HWBug()">,
+ AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, Feature16bitD16HWBug)>;
+def NotHas16bitD16HWBug: Predicate<"Subtarget->useRealTrue16Insts() && "
+ "!Subtarget->has16bitD16HWBug()">,
+ AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts, (not Feature16bitD16HWBug))>;
+
def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">,
AssemblerPredicate<(all_of FeatureBF16TransInsts)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 73acb1ddbd2a7..521cd208f5326 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -38,6 +38,10 @@ bool AMDGPUSubtarget::useRealTrue16Insts() const {
return hasTrue16BitInsts() && EnableRealTrue16Insts;
}
+bool AMDGPUSubtarget::has16bitD16HWBug() const {
+ return hasTrue16BitInsts() && useRealTrue16Insts() && Enable16bitD16HWBug;
+}
+
// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
// allows the given function to achieve an occupancy of NWaves waves per
// SIMD / EU, taking into account only the function's *maximum* workgroup size.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 57b757c990e1a..e5203486436e4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -59,6 +59,7 @@ class AMDGPUSubtarget {
bool HasCvtPkF16F32Inst = false;
bool HasF32ToF16BF16ConversionSRInsts = false;
bool EnableRealTrue16Insts = false;
+ bool Enable16bitD16HWBug = false;
bool HasBF16TransInsts = false;
bool HasBF16ConversionInsts = false;
bool HasBF16PackedInsts = false;
@@ -224,6 +225,8 @@ class AMDGPUSubtarget {
// supported and the support for fake True16 instructions is removed.
bool useRealTrue16Insts() const;
+ bool has16bitD16HWBug() const;
+
bool hasBF16TransInsts() const { return HasBF16TransInsts; }
bool hasBF16ConversionInsts() const {
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index b163a274396ff..db977cacbaebd 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -845,6 +845,14 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
assert(Size % 16 == 0);
Result.second = Result.first + (Size / 16);
+
+ if (Size == 16 && Context->ST->has16bitD16HWBug()) {
+ // also update the other half since lo16/hi16 interfere with each other
+ if (AMDGPU::isHi16Reg(MCReg, *TRI))
+ Result.first -= 1;
+ else
+ Result.second += 1;
+ }
} else if (TRI->isSGPRReg(*MRI, Op.getReg()) && RegIdx < SQ_MAX_PGM_SGPRS) {
// SGPRs including VCC, TTMPs and EXEC but excluding read-only scalar
// sources like SRC_PRIVATE_BASE.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 46b82d3a3d651..1ce7179774349 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -15503,59 +15503,37 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
@@ -52226,59 +52204,37 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
@@ -87002,59 +86958,37 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
@@ -121707,59 +121641,37 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41)
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
@@ -147524,6 +147436,7 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v30.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v28.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v26.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v24.l
@@ -147555,7 +147468,6 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v27.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v29.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v150.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v150.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v147.h
@@ -147572,69 +147484,37 @@ define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v144.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v144.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v135.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v132.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v135.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(57)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v134.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(55)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v133.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v160
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v103.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v113.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v113.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v114.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v114.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v115.h
-; GFX11-TRUE...
[truncated]
|
39195c2
to
962bd62
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please change commit title to |
Co-authored-by: Joe Nash <joseph.nash@amd.com>
Co-authored-by: Joe Nash <joseph.nash@amd.com>
Co-authored-by: Joe Nash <joseph.nash@amd.com>
Co-authored-by: Joe Nash <joseph.nash@amd.com>
✅ With the latest revision this PR passed the C/C++ code formatter. |
34b40f9
to
5ed202c
Compare
llvm/lib/Target/AMDGPU/AMDGPU.td
Outdated
"Use true 16-bit registers" | ||
>; | ||
|
||
def Feature16bitD16HWBug : SubtargetFeature<"d16-hw-bug", |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd prefer to find a more descriptive name for the feature. The symptom is that for waitcnt insertion purposes you need to treat D16 loads as if they write to a full 32-bit VGPR, right? So maybe something like "D16Writes32BitVgpr" or "D16LoadsWriteFullVgpr"?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I personally like "D16Writes32BitVgpr". Updated
60ebe2f
to
60315b0
Compare
It seems that a VMEM access to the hi or lo half of a VGPR can interfere with the other half. Track waitcnt on the full 32-bit VGPR instead of the 16-bit half for 16-bit registers on GFX11.